1 /*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.net;
18
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.base.Preconditions.checkNotNull;
21 import static com.google.common.base.Preconditions.checkState;
22
23 import com.google.common.annotations.Beta;
24 import com.google.common.annotations.GwtCompatible;
25 import com.google.common.base.Ascii;
26 import com.google.common.base.CharMatcher;
27 import com.google.common.base.Joiner;
28 import com.google.common.base.Splitter;
29 import com.google.common.collect.ImmutableList;
30 import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;
31
32 import java.util.List;
33
34 import javax.annotation.Nullable;
35
36 /**
37 * An immutable well-formed internet domain name, such as {@code com} or {@code
38 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
39 * network interactions take place. Thus there is no guarantee that the domain
40 * actually exists on the internet.
41 *
42 * <p>One common use of this class is to determine whether a given string is
43 * likely to represent an addressable domain on the web -- that is, for a
44 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
45 * result in a webpage being displayed? In the past, this test was frequently
46 * done by determining whether the domain ended with a {@linkplain
47 * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
48 * this test is no longer accurate. There are many domains which are both public
49 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
50 * result, the only useful test to determine if a domain is a plausible web host
51 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
52 * which (currently) are not hosts, such as {@code "com"}, but given that any
53 * public suffix may become a host without warning, it is better to err on the
54 * side of permissiveness and thus avoid spurious rejection of valid sites.
55 *
 * <p>During construction, names are normalized in three ways:
 * <ol>
 * <li>ASCII uppercase characters are converted to lowercase.
 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
 * converted to the ASCII period.
 * <li>A single trailing period, if present, is removed.
 * </ol>
62 * <p>The normalized values will be returned from {@link #toString()} and
63 * {@link #parts()}, and will be reflected in the result of
64 * {@link #equals(Object)}.
65 *
66 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
67 * Internationalized domain names</a> such as {@code 网络.cn} are supported, as
68 * are the equivalent <a
69 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
70 * Punycode-encoded</a> versions.
71 *
72 * @author Craig Berry
73 * @since 5.0
74 */
75 @Beta
76 @GwtCompatible
77 public final class InternetDomainName {
78
/**
 * Matches every character accepted as a part separator: the ASCII period
 * plus the code points U+3002, U+FF0E and U+FF61. The constructor replaces
 * each match with {@code '.'}.
 */
private static final CharMatcher DOTS_MATCHER =
    CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
/** Splits a normalized name into its parts on the ASCII period. */
private static final Splitter DOT_SPLITTER = Splitter.on('.');
/** Joins a list of parts back into a dotted name. */
private static final Joiner DOT_JOINER = Joiner.on('.');

/**
 * Value of {@link #publicSuffixIndex} which indicates that no public suffix
 * was found.
 */
private static final int NO_PUBLIC_SUFFIX_FOUND = -1;

/**
 * Regular expression matching one literal ASCII period; passed to
 * {@code String.split} by {@link #matchesWildcardPublicSuffix(String)}.
 */
private static final String DOT_REGEX = "\\.";

/**
 * Maximum parts (labels) in a domain name. This value arises from
 * the 255-octet limit described in
 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11,
 * combined with the fact that the encoding of each part occupies at least
 * two bytes (dot plus label externally, length byte plus label internally).
 * Thus, if all labels have the minimum size of one byte, 127 of them will
 * fit.
 */
private static final int MAX_PARTS = 127;

/**
 * Maximum length of a full domain name, including separators, and
 * leaving room for the root label. See
 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
 */
private static final int MAX_LENGTH = 253;

/**
 * Maximum size of a single part of a domain name. See
 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
 */
private static final int MAX_DOMAIN_PART_LENGTH = 63;
114
/**
 * The full domain name as normalized by the constructor: ASCII letters
 * converted to lower case, dot-like separators replaced by {@code '.'}, and
 * one trailing {@code '.'} removed.
 */
private final String name;

/**
 * The parts of the domain name, obtained by splitting the normalized
 * {@link #name} on {@code '.'}.
 */
private final ImmutableList<String> parts;

/**
 * The index in the {@link #parts()} list at which the public suffix begins.
 * For example, for the domain name {@code www.google.co.uk}, the value would
 * be 2 (the index of the {@code co} part). The value is negative
 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
 * found. Computed once, at construction time, by
 * {@link #findPublicSuffix()}.
 */
private final int publicSuffixIndex;
133
/**
 * Creates a domain name from {@code name} after normalizing it. This is the
 * constructor behind {@link #from(String)}.
 *
 * <p>Normalization replaces every dot-like separator with {@code '.'},
 * lower-cases ASCII letters and drops one trailing {@code '.'}. The
 * normalized text must then pass the length, part-count and syntax checks.
 *
 * @param name the candidate domain name
 * @throws IllegalArgumentException if the normalized name is longer than
 *     {@code MAX_LENGTH} characters, has more than {@code MAX_PARTS} parts,
 *     or is rejected by {@link #validateSyntax(List)}
 */
InternetDomainName(String name) {
  // Normalize:
  //   * All dot-like characters to '.'
  //   * ASCII characters to lowercase
  //   * Strip one trailing '.'
  name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));

  if (name.endsWith(".")) {
    name = name.substring(0, name.length() - 1);
  }

  // The limits are checked against the normalized text, which is also the
  // text reported in the error messages.
  checkArgument(name.length() <= MAX_LENGTH,
      "Domain name too long: '%s'", name);
  this.name = name;

  this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
  checkArgument(parts.size() <= MAX_PARTS,
      "Domain has too many parts: '%s'", name);
  checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);

  // Needs the validated parts list, so it must come last.
  this.publicSuffixIndex = findPublicSuffix();
}
160
161 /**
162 * Returns the index of the leftmost part of the public suffix, or -1 if not
163 * found. Note that the value defined as the "public suffix" may not be a
164 * public suffix according to {@link #isPublicSuffix()} if the domain ends
165 * with an excluded domain pattern such as {@code "nhs.uk"}.
166 */
167 private int findPublicSuffix() {
168 final int partsSize = parts.size();
169
170 for (int i = 0; i < partsSize; i++) {
171 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
172
173 if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) {
174 return i;
175 }
176
177 // Excluded domains (e.g. !nhs.uk) use the next highest
178 // domain as the effective public suffix (e.g. uk).
179
180 if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) {
181 return i + 1;
182 }
183
184 if (matchesWildcardPublicSuffix(ancestorName)) {
185 return i;
186 }
187 }
188
189 return NO_PUBLIC_SUFFIX_FOUND;
190 }
191
192 /**
193 * Returns an instance of {@link InternetDomainName} after lenient
194 * validation. Specifically, validation against <a
195 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
196 * ("Internationalizing Domain Names in Applications") is skipped, while
197 * validation against <a
198 * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
199 * the following ways:
200 * <ul>
201 * <li>Any part containing non-ASCII characters is considered valid.
202 * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
203 * <li>Parts other than the final part may start with a digit.
204 * </ul>
205 *
206 *
207 * @param domain A domain name (not IP address)
208 * @throws IllegalArgumentException if {@code name} is not syntactically valid
209 * according to {@link #isValid}
210 * @since 10.0 (previously named {@code fromLenient})
211 */
212 public static InternetDomainName from(String domain) {
213 return new InternetDomainName(checkNotNull(domain));
214 }
215
216 /**
217 * Validation method used by {@from} to ensure that the domain name is
218 * syntactically valid according to RFC 1035.
219 *
220 * @return Is the domain name syntactically valid?
221 */
222 private static boolean validateSyntax(List<String> parts) {
223 final int lastIndex = parts.size() - 1;
224
225 // Validate the last part specially, as it has different syntax rules.
226
227 if (!validatePart(parts.get(lastIndex), true)) {
228 return false;
229 }
230
231 for (int i = 0; i < lastIndex; i++) {
232 String part = parts.get(i);
233 if (!validatePart(part, false)) {
234 return false;
235 }
236 }
237
238 return true;
239 }
240
/**
 * Matches the dash and the underscore. {@link #validatePart} rejects a part
 * that starts or ends with either character.
 */
private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");

/**
 * Matches the characters accepted inside a part: anything matched by
 * {@code CharMatcher.JAVA_LETTER_OR_DIGIT}, plus the dash and underscore.
 * {@link #validatePart} applies it to the ASCII characters of a part only.
 * Declared after {@link #DASH_MATCHER} because it is built from it.
 */
private static final CharMatcher PART_CHAR_MATCHER =
    CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
245
246 /**
247 * Helper method for {@link #validateSyntax(List)}. Validates that one part of
248 * a domain name is valid.
249 *
250 * @param part The domain name part to be validated
251 * @param isFinalPart Is this the final (rightmost) domain part?
252 * @return Whether the part is valid
253 */
254 private static boolean validatePart(String part, boolean isFinalPart) {
255
256 // These tests could be collapsed into one big boolean expression, but
257 // they have been left as independent tests for clarity.
258
259 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
260 return false;
261 }
262
263 /*
264 * GWT claims to support java.lang.Character's char-classification methods,
265 * but it actually only works for ASCII. So for now, assume any non-ASCII
266 * characters are valid. The only place this seems to be documented is here:
267 * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
268 *
269 * <p>ASCII characters in the part are expected to be valid per RFC 1035,
270 * with underscore also being allowed due to widespread practice.
271 */
272
273 String asciiChars = CharMatcher.ASCII.retainFrom(part);
274
275 if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
276 return false;
277 }
278
279 // No initial or final dashes or underscores.
280
281 if (DASH_MATCHER.matches(part.charAt(0))
282 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
283 return false;
284 }
285
286 /*
287 * Note that we allow (in contravention of a strict interpretation of the
288 * relevant RFCs) domain parts other than the last may begin with a digit
289 * (for example, "3com.com"). It's important to disallow an initial digit in
290 * the last part; it's the only thing that stops an IPv4 numeric address
291 * like 127.0.0.1 from looking like a valid domain name.
292 */
293
294 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
295 return false;
296 }
297
298 return true;
299 }
300
301 /**
302 * Returns the individual components of this domain name, normalized to all
303 * lower case. For example, for the domain name {@code mail.google.com}, this
304 * method returns the list {@code ["mail", "google", "com"]}.
305 */
306 public ImmutableList<String> parts() {
307 return parts;
308 }
309
310 /**
311 * Indicates whether this domain name represents a <i>public suffix</i>, as
312 * defined by the Mozilla Foundation's
313 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
314 * suffix is one under which Internet users can directly register names, such
315 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
316 * names that are <i>not</i> public suffixes include {@code google}, {@code
317 * google.com} and {@code foo.co.uk}.
318 *
319 * @return {@code true} if this domain name appears exactly on the public
320 * suffix list
321 * @since 6.0
322 */
323 public boolean isPublicSuffix() {
324 return publicSuffixIndex == 0;
325 }
326
327 /**
328 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
329 * public suffix}, including if it is a public suffix itself. For example,
330 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
331 * {@code com}, but not for {@code google} or {@code google.foo}. This is
332 * the recommended method for determining whether a domain is potentially an
333 * addressable host.
334 *
335 * @since 6.0
336 */
337 public boolean hasPublicSuffix() {
338 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
339 }
340
341 /**
342 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
343 * domain name, or {@code null} if no public suffix is present.
344 *
345 * @since 6.0
346 */
347 public InternetDomainName publicSuffix() {
348 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
349 }
350
351 /**
352 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
353 * public suffix}, while not being a public suffix itself. For example,
354 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
355 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
356 * google.foo}.
357 *
358 * <p><b>Warning:</b> a {@code false} result from this method does not imply
359 * that the domain does not represent an addressable host, as many public
360 * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
361 * that test.
362 *
363 * <p>This method can be used to determine whether it will probably be
364 * possible to set cookies on the domain, though even that depends on
365 * individual browsers' implementations of cookie controls. See
366 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
367 *
368 * @since 6.0
369 */
370 public boolean isUnderPublicSuffix() {
371 return publicSuffixIndex > 0;
372 }
373
374 /**
375 * Indicates whether this domain name is composed of exactly one subdomain
376 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
377 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
378 * but not for {@code www.google.com} or {@code co.uk}.
379 *
380 * <p><b>Warning:</b> A {@code true} result from this method does not imply
381 * that the domain is at the highest level which is addressable as a host, as
382 * many public suffixes are also addressable hosts. For example, the domain
383 * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
384 * return {@code true} from this method. But {@code uk.com} is itself an
385 * addressable host.
386 *
387 * <p>This method can be used to determine whether a domain is probably the
388 * highest level for which cookies may be set, though even that depends on
389 * individual browsers' implementations of cookie controls. See
390 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
391 *
392 * @since 6.0
393 */
394 public boolean isTopPrivateDomain() {
395 return publicSuffixIndex == 1;
396 }
397
398 /**
399 * Returns the portion of this domain name that is one level beneath the
400 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
401 * {@code google.co.uk}, since {@code co.uk} is a public suffix.
402 *
403 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
404 * instance is returned.
405 *
406 * <p>This method should not be used to determine the topmost parent domain
407 * which is addressable as a host, as many public suffixes are also
408 * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
409 * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
410 * from this method. But {@code uk.com} is itself an addressable host.
411 *
412 * <p>This method can be used to determine the probable highest level parent
413 * domain for which cookies may be set, though even that depends on individual
414 * browsers' implementations of cookie controls.
415 *
416 * @throws IllegalStateException if this domain does not end with a
417 * public suffix
418 * @since 6.0
419 */
420 public InternetDomainName topPrivateDomain() {
421 if (isTopPrivateDomain()) {
422 return this;
423 }
424 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
425 return ancestor(publicSuffixIndex - 1);
426 }
427
428 /**
429 * Indicates whether this domain is composed of two or more parts.
430 */
431 public boolean hasParent() {
432 return parts.size() > 1;
433 }
434
435 /**
436 * Returns an {@code InternetDomainName} that is the immediate ancestor of
437 * this one; that is, the current domain with the leftmost part removed. For
438 * example, the parent of {@code www.google.com} is {@code google.com}.
439 *
440 * @throws IllegalStateException if the domain has no parent, as determined
441 * by {@link #hasParent}
442 */
443 public InternetDomainName parent() {
444 checkState(hasParent(), "Domain '%s' has no parent", name);
445 return ancestor(1);
446 }
447
448 /**
449 * Returns the ancestor of the current domain at the given number of levels
450 * "higher" (rightward) in the subdomain list. The number of levels must be
451 * non-negative, and less than {@code N-1}, where {@code N} is the number of
452 * parts in the domain.
453 *
454 * <p>TODO: Reasonable candidate for addition to public API.
455 */
456 private InternetDomainName ancestor(int levels) {
457 return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
458 }
459
460 /**
461 * Creates and returns a new {@code InternetDomainName} by prepending the
462 * argument and a dot to the current name. For example, {@code
463 * InternetDomainName.from("foo.com").child("www.bar")} returns a new
464 * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
465 * lenient validation is performed, as described {@link #from(String) here}.
466 *
467 * @throws NullPointerException if leftParts is null
468 * @throws IllegalArgumentException if the resulting name is not valid
469 */
470 public InternetDomainName child(String leftParts) {
471 return from(checkNotNull(leftParts) + "." + name);
472 }
473
474 /**
475 * Indicates whether the argument is a syntactically valid domain name using
476 * lenient validation. Specifically, validation against <a
477 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
478 * ("Internationalizing Domain Names in Applications") is skipped.
479 *
480 * <p>The following two code snippets are equivalent:
481 *
482 * <pre> {@code
483 * domainName = InternetDomainName.isValid(name)
484 * ? InternetDomainName.from(name)
485 * : DEFAULT_DOMAIN;}</pre>
486 *
487 * <pre> {@code
488 * try {
489 * domainName = InternetDomainName.from(name);
490 * } catch (IllegalArgumentException e) {
491 * domainName = DEFAULT_DOMAIN;
492 * }}</pre>
493 *
494 * @since 8.0 (previously named {@code isValidLenient})
495 */
496 public static boolean isValid(String name) {
497 try {
498 from(name);
499 return true;
500 } catch (IllegalArgumentException e) {
501 return false;
502 }
503 }
504
505 /**
506 * Does the domain name match one of the "wildcard" patterns (e.g.
507 * {@code "*.ar"})?
508 */
509 private static boolean matchesWildcardPublicSuffix(String domain) {
510 final String[] pieces = domain.split(DOT_REGEX, 2);
511 return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]);
512 }
513
514 /**
515 * Returns the domain name, normalized to all lower case.
516 */
517 @Override
518 public String toString() {
519 return name;
520 }
521
522 /**
523 * Equality testing is based on the text supplied by the caller,
524 * after normalization as described in the class documentation. For
525 * example, a non-ASCII Unicode domain name and the Punycode version
526 * of the same domain name would not be considered equal.
527 *
528 */
529 @Override
530 public boolean equals(@Nullable Object object) {
531 if (object == this) {
532 return true;
533 }
534
535 if (object instanceof InternetDomainName) {
536 InternetDomainName that = (InternetDomainName) object;
537 return this.name.equals(that.name);
538 }
539
540 return false;
541 }
542
543 @Override
544 public int hashCode() {
545 return name.hashCode();
546 }
547 }